library(readxl)
library(tidyverse)
set.seed(89)

#### DATA READ-IN ####

## SURVEY DATA

# Load the survey export, discard rows 1-3 and 11-14 of the parsed table by
# position, keep only respondents who certified being 18+, agreed to take
# part, and started on or after 2024-02-19, then remove the listed
# identifier / metadata columns (IP address, recipient details, location,
# selected question columns, ID and treatment).
survey_data <- read_csv(
  file = "Data/Survey Results/7 April 2024/Ghana - Spillovers TB Wave II_April 7, 2024_10.37.csv"
)[-c(1:3, 11:14), ] %>%
  filter(
    Q4.2 == "Yes, I certify that I am 18 years of age or over",
    Q4.3 == "Yes, I agree to take part",
    as.Date(StartDate) >= as.Date("2024-02-19")
  ) %>%
  select(
    !all_of(c(
      # export metadata
      "IPAddress", "RecipientLastName", "RecipientFirstName",
      "RecipientEmail", "ExternalReference", "LocationLatitude",
      "LocationLongitude", "DistributionChannel", "UserLanguage",
      # individual question columns
      "Q3.1", "Q3.2", "Q21.2", "Q21.3", "Q213", "Q21.4", "ID",
      # assignment column
      "treatment"
    ))
  )

## VARIABLE RENAME

# Lookup table mapping raw survey column names (`variable`) to analysis names
# (`new_variable`); its `description` column is reused for the codebook below.
var_rename <- read_csv("Admin/District files_TR/survey_variables.csv")

# Rename, in lookup-table order, every survey column that appears in the table.
# seq_len() keeps the loop from running when the table has no rows, and the
# `%in%` test simply skips names that are absent from survey_data (a missing
# / NA name never matches) instead of relying on `[[` returning NULL.
for (i in seq_len(nrow(var_rename))) {
  old_name <- var_rename$variable[i]
  if (old_name %in% colnames(survey_data)) {
    colnames(survey_data)[colnames(survey_data) == old_name] <- var_rename$new_variable[i]
  }
}

## CLINIC ATTENDANCE - matched list of respondents + household attendance at TB clinics

# Akwapim name-matching sheet: keep the five columns needed downstream and
# give them the shared clinic column names while selecting.
clinic_akwapim <- read_xlsx("Admin/District files_MS/NameMatching_Master File_Akwapim_MS.xlsx") %>%
  # drop stray Cape Coast rows (original author's note: none of them have
  # non-zero attendance)
  filter(District != "Cape Coast Metro") %>%
  select(
    ResponseId = "Response ID",
    "First Name",
    subj_attended = "Subject Attended the Clinic",
    household_attended = "No of additional household members attended Clinic",
    subj_reminder_confirmation = "...18"
  )

# Cape Coast name-matching file: same five columns under the same names.
clinic_cape <- read_csv("Admin/District files_MS/NameMatching_Master File_CapeCoast_MS.csv") %>%
  select(
    "ResponseId",
    `First Name` = "Q3.1",
    subj_attended = "Subject Attended the Clinic",
    household_attended = "No of additional household members attended Clinic...11",
    subj_reminder_confirmation = "Contact status"
  )

# Stack both districts, keep rows that carry a ResponseId, treat missing
# attendance values as 0, and drop the respondent's first name.
clinic <- rbind(clinic_akwapim, clinic_cape) %>%
  filter(!is.na(ResponseId)) %>%
  mutate(
    across(
      c(household_attended, subj_attended),
      ~ ifelse(is.na(.x), 0, .x)
    )
  ) %>%
  select(-"First Name")

## COMMUNITY DATA -- data on overall attendance and population figures for communities

# One community-level file per district.
community_akwapim <- read_csv("Admin/District files_TR/awkapim_communities.csv")
community_cape <- read_csv("Admin/District files_TR/capecoast_communities.csv")

# Stack the two districts and remove the District / Sub District columns
# from the community table.
community <- select(
  rbind(community_akwapim, community_cape),
  !all_of(c("District", "Sub District"))
)

#### MERGING ####

# Build the analysis dataset:
#   1. attach clinic attendance by ResponseId and community figures by
#      Community (left joins, so every survey row is kept);
#   2. derive a single community_treatment label from the three treatment
#      indicator columns;
#   3. collapse each question that exists in three suffixed variants into one
#      column holding the first non-missing answer;
#   4. drop the variant columns plus enumerator name, GPS and contact details.
final_data <- survey_data %>%
  left_join(clinic, by = "ResponseId") %>%
  left_join(community, by = "Community") %>%
  mutate(
    # First indicator equal to 1 wins; NA when none of them is 1.
    community_treatment = case_when(
      TBHealth == 1 ~ "TBHealth",
      TBHealthPlusText == 1 ~ "TBHealthPlusText",
      TBHealthPlus3 == 1 ~ "TBHealthPlus3",
      TRUE ~ NA
    ),

    # coalesce() returns, row by row, the first non-missing value among its
    # arguments (NA if all are missing) -- the same result as the previous
    # `case_when(!is.na(a) ~ a, !is.na(b) ~ b, !is.na(c) ~ c, TRUE ~ NA)`.
    screen_willing = coalesce(screen_willing_1, screen_willing_2, screen_willing_3),
    screen_day_1 = coalesce(screen_day_1_1, screen_day_2_1, screen_day_3_1),
    screen_time_1 = coalesce(screen_time_1_1, screen_time_2_1, screen_time_3_1),
    screen_day_2 = coalesce(screen_day_1_2, screen_day_2_2, screen_day_3_2),
    screen_time_2 = coalesce(screen_time_1_2, screen_time_2_2, screen_time_3_2),
    tb_interest = coalesce(tb_interest_1, tb_interest_2, tb_interest_3),
    tb_interest_other = coalesce(
      tb_interest_1_other, tb_interest_2_other, tb_interest_3_other
    )
  ) %>%
  select(
    -all_of(
      c(
        "screen_willing_1", "screen_willing_2", "screen_willing_3",
        "screen_day_1_1", "screen_day_1_2", "screen_day_2_1", "screen_day_2_2", "screen_day_3_1", "screen_day_3_2",
        "screen_time_1_1", "screen_time_1_2", "screen_time_2_1", "screen_time_2_2", "screen_time_3_1", "screen_time_3_2",
        "tb_interest_1", "tb_interest_2", "tb_interest_3",
        "tb_interest_1_other", "tb_interest_2_other", "tb_interest_3_other",
        "enum_name", "gps_long", "gps_lat",
        "contact_info"
      )
    )
  )

## Write anonymised final data
write_csv(final_data, "Blinding/ghana_ii_final_data_anon.csv")

## Codebook

# Start from the descriptions of every renamed variable that survived into
# final_data ...
codebook_template <- var_rename %>%
  select(new_variable, description) %>%
  filter(new_variable %in% colnames(final_data))

# ... then append, in final_data column order, one row with an empty
# description for each column that has no entry yet. Built in a single
# bind_rows() call rather than growing the table row by row in a loop.
undocumented_cols <- setdiff(colnames(final_data), codebook_template$new_variable)

codebook_template <- bind_rows(
  codebook_template,
  tibble(
    new_variable = undocumented_cols,
    description = rep("", length(undocumented_cols))
  )
)

# write_csv(codebook_template, "Blinding/codebook_template.csv")

#### BLINDING ####

# Fixed seed so the blinding permutation is reproducible.
set.seed(928734)

# Columns treated as community-level: they are blinded by permuting whole
# communities rather than individual respondents.
community_level <- c(
  "District", "Sub District", "Community",
  "verify_community",
  "TBHealth", "TBHealthPlusText", "TBHealthPlus3",
  "phone_provided", "reminders_completed", "community_p",
  "completed_p", "population", "community_attended", "names_matched",
  "community_treatment"
)

# One row per distinct combination of community-level values.
# verify_community is blanked first so it cannot split a community into
# several rows.
comm_data <- final_data %>%
  select(all_of(community_level)) %>%
  mutate(verify_community = NA) %>%
  distinct()

# Enumerator columns stay attached to the respondent's real geography.
enum_data <- final_data[, c("enum_device", "enum_id")]

geog_cols <- c("District", "Sub District", "Community", "verify_community")
treat_cols <- c("TBHealth", "TBHealthPlusText", "TBHealthPlus3", "community_treatment",
                "phone_provided", "reminders_completed", "completed_p")
attend_cols <- c("community_p", "population", "community_attended")

# Key used to join the blinded community table back onto respondents.
comm_key <- c("District", "Sub District", "Community")

# If a key occurred more than once, the join below would duplicate respondent
# rows and the later column-binding with individual-level data would either
# fail with an unrelated message or be recycled. Fail early and clearly.
if (anyDuplicated(comm_data[, comm_key]) > 0) {
  stop(
    "Community-level columns are not constant within ",
    "District / Sub District / Community; cannot blind at community level.",
    call. = FALSE
  )
}

# Three independent row permutations, drawn in this order: treatment block,
# attendance block, names_matched. Columns inside a block move together.
# sample.int(n) yields the same draws as sample(1:n, n).
n_comm <- nrow(comm_data)
treat_perm <- sample.int(n_comm)
attend_perm <- sample.int(n_comm)
matched_perm <- sample.int(n_comm)

# True geography keys next to the permuted community-level blocks.
blind_comm <- cbind(
  comm_data %>% select(all_of(comm_key)),
  comm_data[treat_perm, ] %>% select(all_of(treat_cols)),
  comm_data[attend_perm, ] %>% select(all_of(attend_cols)),
  comm_data[matched_perm, ] %>% select(names_matched)
)

# Respondent-level frame: real geography + enumerator columns, with the
# blinded community-level values attached by community.
blind_comms <- final_data %>%
  select(all_of(geog_cols)) %>%
  cbind(., enum_data) %>%
  left_join(blind_comm, by = c("District", "Sub District", "Community"))


# individual level data

# Groups of individual-level columns that are shuffled together (one row
# permutation per group) so the relationship between columns inside a group
# is preserved in the blinded data.
ind_pairs <- list(
  c("StartDate", "EndDate", "RecordedDate", "Duration (in seconds)"),
  c("ethnicity", "ethnicity_other"),
  c("villages_family", "villages_family_n"),
  c("villages_friends", "villages_friends_n"),
  c("whatsapp", "whatsapp_freq",
    "whatsapp_close_family",
    "whatsapp_close_friends",
    "whatsapp_colleagues",
    "whatsapp_family_far",
    "whatsapp_neighbors",
    "whatsapp_church",
    "whatsapp_other",
    "whatsapp_no_groups"),
  c("sm_fb", "sm_twitter",
    "sm_insta", "sm_reddit",
    "sm_youtube", "sm_snapchat",
    "sm_tiktok", "sm_other",
    "sm_none", "sm_freq"),
  c("cond_diab", "cond_tb",
    "cond_covid", "cond_malaria",
    "cond_yf", "cond_bp",
    "cond_heart", "cond_asthm",
    "cond_allergies", "cond_kd",
    "cond_other", "cond_none",
    "cond_dk", "cond_pnts"),
  c("tb_interest", "tb_interest_other"),
  c("n_household", "household_attended")
)

# Flat vector of every column that belongs to some group.
ind_excl <- purrr::list_c(ind_pairs)

# Columns shuffled one by one: everything that is neither community-level,
# nor part of a group, nor an enumerator column. The enumerator columns are
# already carried (unshuffled) in blind_comms; leaving them here as well would
# put two columns with the same name into the combined frame and make the
# final select() by name ambiguous.
ind_data <- final_data %>%
  select(
    -all_of(community_level),
    -all_of(ind_excl),
    -all_of(colnames(enum_data))
  )
paired_data <- final_data %>% select(all_of(ind_excl))

n_resp <- nrow(final_data)

# Independent permutation for every remaining column. Shuffling via an index
# vector avoids sample()'s special case where a length-one numeric `x` is
# interpreted as `1:x`; for more than one row the draws are unchanged.
blind_ind <- ind_data
for (var in colnames(ind_data)) {
  blind_ind[[var]] <- ind_data[[var]][sample.int(n_resp)]
}

# One permutation per group, applied to all columns of that group at once.
blind_paired <- lapply(ind_pairs, function(p) {
  ind_tmp <- paired_data %>% select(all_of(p))
  ind_tmp[sample.int(n_resp), ]
}) %>%
  do.call("cbind", .)

# Reassemble and restore the column order of final_data.
blind_data <- cbind(blind_ind, blind_comms) %>%
  cbind(blind_paired) %>%
  select(all_of(colnames(final_data)))

write_csv(blind_data, "Blinding/blinded_data.csv")
